########################################################################
######### Converts original monadic data into vertex covariates ########
########################################################################

### Monadic covariates: democracy, human rights, economic freedom, 
#### GDP, population, military spending, CINC score, nuclear weapons,
#### region, IGO headquarter

########################################################################

### Accomplishes 3 tasks:
#### 1) Deals with Cold War name changes
#### 2) When needed, inserts COW-character country code
#### 3) Creates a vertex covariate file ("vcov.csv")

########################################################################
# Install required packages, but only those that are not already available
# (an unconditional install.packages() re-downloads on every run and fails
# when there is no network access)
for (pkg in c("countrycode", "plyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
########################################################################

# Set the working directory (all input files below are read relative to it)
setwd("~/Dropbox/ISQ_ms/5 Final/Replication")

# Set SAVEDIR (directory the final "vcov.csv" is written to)
SAVEDIR <- "~/Dropbox/ISQ_ms/5 Final/Replication"

# load libraries
library(countrycode)
library(plyr)

# Read in the state IDs data (columns used below: stateabb, year)
stateIDs <- read.csv("stateIDs.csv", stringsAsFactors = FALSE)

# Helper: state abbreviations listed in stateIDs for one year
ids_in_year <- function(yr) {
  stateIDs$stateabb[stateIDs$year == yr]
}

# Extract the IDs for each year; these vectors are used in the coverage
# checks of the sections below
stateIDs1970 <- ids_in_year(1970)
stateIDs1975 <- ids_in_year(1975)
stateIDs1980 <- ids_in_year(1980)
stateIDs1985 <- ids_in_year(1985)
stateIDs1990 <- ids_in_year(1990)
stateIDs1995 <- ids_in_year(1995)
stateIDs2000 <- ids_in_year(2000)
stateIDs2005 <- ids_in_year(2005)
stateIDs2010 <- ids_in_year(2010)

# Set period of interest: every fifth year from 1970 to 2010
myyears <- seq(from = 1970, to = 2010, by = 5)


########################################################################
################## Democracy: Modified Polity IV data ##################
########################################################################

# Read in original data (remote file; requires network access)
polity <- read.table(
  "http://privatewww.essex.ac.uk/~ksg/data/ksgp4use.asc",
  header = TRUE
)
names(polity)

# Code transitions etc as NAs: any POLITY score below -10 becomes missing
below_scale <- which(polity$POLITY < -10)
polity$POLITY[below_scale] <- NA

# Keep only the observations from the period of interest
polity <- polity[is.element(polity$YEAR, myyears), ]


### 1) Deal with name changes
# (bare subsetting expressions below only print rows for visual inspection)

# check Czechoslovakia etc
polity[polity$CCODE == 315, ] # ok only until 1992
polity[polity$CCODE == 316, ] # ok only starts in 1993
polity[polity$CCODE == 317, ] # ok only starts in 1993

# check Germany
polity[polity$CCODE == 260, ]
# Gleditsch codes Germany after unification as a continuation of GFR (260);
# recode those rows to 255 from 1990 on
gfr_unified <- polity$YEAR >= 1990 & polity$SCODE == "GFR"
polity$CCODE[gfr_unified] <- 255
polity[polity$SCODE == "GFR", ]

# check Russia
polity[polity$SCODE == "RUS", ] # ok

# check Yemen
polity[polity$CCODE == 678, ]
# Gleditsch codes Yemen after unification (in 1990) as a continuation of YAR,
# but uses the scode for YEM; recode those rows to 679 (YEM) from 1990 on
yem_unified <- polity$YEAR >= 1990 & polity$SCODE == "YEM"
polity$CCODE[yem_unified] <- 679
polity[polity$SCODE == "YEM", ]

# check Yugoslavia
polity[polity$SCODE == "YUG", ] # ok


### 2) Insert COW-character country code

# Translate the numeric COW code into the COW character abbreviation
polity$stateabb <- countrycode(polity$CCODE, "cown", "cowc")
table(is.na(polity$stateabb))


#### 3) Extract variables of interest

# Keep four columns and give them the lower-case names used as join keys
polity <- setNames(
  polity[, c("CCODE", "stateabb", "POLITY", "YEAR")],
  c("ccode", "stateabb", "polity", "year")
)

# add democracy dummy (Polity IV > 6):
# 1 above 6, 0 from -10 to 6, NA otherwise (including missing scores)
polity$dem_dum <- as.numeric(
  ifelse(polity$polity > 6, 1, ifelse(polity$polity >= -10, 0, NA))
)

# check data by year
tapply(polity$polity, polity$year, summary)
tapply(polity$dem_dum, polity$year, summary)


#### 4) Join into vcov data frame

# check polity coverage, using 1970 as the reference year
polity1970 <- unique(polity$stateabb[polity$year == 1970])
# states in polity not in stateIDs: "ZIM" "OMA" "FIJ"
polity1970[!(polity1970 %in% stateIDs1970)]
# states in stateIDs not in polity: "WSM"
stateIDs1970[!(stateIDs1970 %in% polity1970)]

# join: vcov starts as the full state-year list; polity columns are attached
# with a left join so every stateIDs row is kept
vcov <- join(stateIDs, polity, by = c("stateabb", "ccode", "year"), type = "left")
# print the state-years left without a polity score
vcov[is.na(vcov$polity), ]


########################################################################
######## Human rights and econ liberalism: Modified QoG IV data ########
########################################################################

# Read in original data
qog <- read.csv("qog_std_ts_jan15.csv", stringsAsFactors = FALSE)


#### 1) Extract variables of interest

# extract years of interest
qog <- qog[qog$year %in% myyears, ]

# define and extract variables of interest
# (columns keep the order they have in the source file, not the order of myvars)
myvars <- c("ccodecow", "cname", "year", "gd_ptss", "hf_efiscore")
qog <- qog[names(qog) %in% myvars]

# rename ccodecow variable; done by name rather than by position so the
# right column is renamed whatever the column order of the source file is
names(qog)[names(qog) == "ccodecow"] <- "ccode"


### 2) Deal with name changes
# (bare subsetting/table expressions below only print output for inspection)

# check names
table(qog$cname)
table(is.na(qog$ccode))

# check country names
qog[is.na(qog$ccode), ]

# row masks reused below (no rows are added or dropped until the NA filter)
before_1995 <- qog$year < 1995
after_1990 <- qog$year > 1990

# fix Ethiopia
qog[qog$cname == "Ethiopia (-1992)", ] # only NA's; need to fix
qog$ccode[qog$cname == "Ethiopia (-1992)" & before_1995] <- 530
qog[qog$cname == "Ethiopia (1993-)", ]
qog$ccode[qog$cname == "Ethiopia (1993-)" & before_1995] <- NA

# now only USSR and Serbia/Serbia and Montenegro lack codes for years when there's data

# check Russia
qog[which(qog$cname == "USSR"), ] # only before 1990, but ccode missing
qog$ccode[qog$cname == "USSR" & before_1995] <- 365
qog[which(qog$cname == "Russia"), ] # only after 1995; need to remove ccode before 1995
qog$ccode[qog$cname == "Russia" & before_1995] <- NA

# check Yugoslavia
qog[which(qog$ccode == 345), ] # 1976-1991
qog$ccode[qog$cname == "Yugoslavia" & after_1990] <- NA
qog$ccode[qog$cname == "Serbia and Montenegro" & after_1990] <- 345

# drop missing values for ccode
qog <- qog[!is.na(qog$ccode), ]
table(is.na(qog$ccode))

# insert stateabb var (COW character abbreviation from the numeric code)
qog$stateabb <- countrycode(qog$ccode, "cown", "cowc")


### 3) Invert PTS scale
# Swap 1 <-> 5 and 2 <-> 4; every other value (including 3) is left as is
qog$pts_inv <- qog$gd_ptss
for (score in c(1, 2, 4, 5)) {
  qog$pts_inv[qog$gd_ptss == score] <- 6 - score
}

# select vars of interest
myvars <- c("stateabb", "ccode", "year", "pts_inv", "hf_efiscore")
qog <- qog[names(qog) %in% myvars]

# add dummies

# add PTS dummy: 1 when pts_inv is 5, 0 when below 5, NA otherwise
table(qog$pts_inv)
qog$pts_dum <- as.numeric(
  ifelse(qog$pts_inv == 5, 1, ifelse(qog$pts_inv < 5, 0, NA))
)
table(qog$pts_dum)

# add IEF dummy: 1 at or above the pooled 75th percentile of hf_efiscore
# (computed over all retained years together), 0 below it, NA when missing
tapply(qog$hf_efiscore, qog$year, summary)
ief_cutoff <- quantile(qog$hf_efiscore, c(.75), na.rm = TRUE)[[1]]
qog$ief_dum <- as.numeric(qog$hf_efiscore >= ief_cutoff)
tapply(qog$ief_dum, qog$year, table)

# check data by year
tapply(qog$pts_inv, qog$year, summary)
tapply(qog$pts_dum, qog$year, summary)
tapply(qog$hf_efiscore, qog$year, summary)
tapply(qog$ief_dum, qog$year, summary)


#### 4) Join into vcov data frame

# check qog coverage
# states in stateIDs not in qog, for three sample years
stateIDs1980[!(stateIDs1980 %in% qog$stateabb[qog$year == 1980])] # ok none
stateIDs1995[!(stateIDs1995 %in% qog$stateabb[qog$year == 1995])] # ok none
stateIDs2010[!(stateIDs2010 %in% qog$stateabb[qog$year == 2010])] # "KOS"

# join (left join: every vcov row is kept)
vcov <- join(vcov, qog, by = c("stateabb", "ccode", "year"), type = "left")
# print the state-years left without human rights / economic freedom data
vcov[is.na(vcov$pts_inv), ]
vcov[is.na(vcov$hf_efiscore), ]


########################################################################
######################## GDP: Gleditsch (2002) #########################
########################################################################

# Read in original data
gdp <- read.table("gdpv6.txt", header = TRUE)
names(gdp)

# check years
table(gdp$year)

# keep the rows from the period of interest and the first six columns
gdp <- gdp[is.element(gdp$year, myyears), seq_len(6)]

### 1) Deal with name changes
# (bare subsetting expressions below only print rows for visual inspection)

# check Czechoslovakia etc
gdp[gdp$statenum == 315, ] # ok only until 1992
gdp[gdp$statenum == 316, ] # ok only starts in 1993
gdp[gdp$statenum == 317, ] # ok only starts in 1993

# check Germany
gdp[gdp$statenum == 260, ]
# Gleditsch codes Germany after unification as a continuation of GFR (260);
# recode those rows to 255 from 1991 on
gfr_unified <- gdp$year >= 1991 & gdp$stateid == "GFR"
gdp$statenum[gfr_unified] <- 255
gdp[gdp$stateid == "GFR", ]
gdp[gdp$statenum == 265, ] # ok

# check Russia
gdp[gdp$stateid == "RUS", ] # ok

# check Yemen
gdp[gdp$statenum == 678, ]
# Gleditsch codes Yemen after unification (in 1990) as a continuation of YAR,
# but uses the scode for YEM; recode those rows to 679 (YEM) from 1991 on
yem_unified <- gdp$year >= 1991 & gdp$stateid == "YEM"
gdp$statenum[yem_unified] <- 679
gdp[gdp$stateid == "YEM", ]
gdp[gdp$statenum == 680, ] # ok

# check Yugoslavia
gdp[gdp$stateid == "YUG", ] # ok, only until 2006
gdp[gdp$stateid == "SER", ] # 2006-2011
gdp[gdp$stateid == "BOS", ] # ok starts 1992
gdp[gdp$stateid == "CRO", ] # ok starts 1991
gdp[gdp$stateid == "SLV", ] # ok starts 1992


### 2) Insert COW-character country code

# Translate the numeric code into the COW character abbreviation and drop
# the rows for which no abbreviation is found
gdp$stateabb <- countrycode(gdp$statenum, "cown", "cowc")
gdp[is.na(gdp$stateabb), ] # ok, it's NAU, TON and TUV (will drop because of missing data)
gdp <- gdp[!is.na(gdp$stateabb), ]


#### 3) Extract variables of interest

# Keep six columns; statenum is renamed to ccode to match the join keys
gdp <- setNames(
  gdp[, c("statenum", "stateabb", "pop", "realgdp", "rgdppc", "year")],
  c("ccode", "stateabb", "pop", "realgdp", "rgdppc", "year")
)

# Check for missing values
gdp[is.na(gdp$pop), ] # no NAs
gdp[is.na(gdp$realgdp), ] # no NAs
gdp[is.na(gdp$rgdppc), ] # no NAs

# Log-transform vars (natural log)
gdp$log_pop <- log(gdp$pop)
gdp$log_realgdp <- log(gdp$realgdp)
gdp$log_rgdppc <- log(gdp$rgdppc)

# check data by year
tapply(gdp$log_pop, gdp$year, summary)
tapply(gdp$log_realgdp, gdp$year, summary)
tapply(gdp$log_rgdppc, gdp$year, summary)


#### 4) Join into vcov data frame

# check gdp coverage
# states in stateIDs not in gdp, for three sample years
stateIDs1980[!(stateIDs1980 %in% gdp$stateabb[gdp$year == 1980])] # BRU
stateIDs1985[!(stateIDs1985 %in% gdp$stateabb[gdp$year == 1985])] # NAM
stateIDs2000[!(stateIDs2000 %in% gdp$stateabb[gdp$year == 2000])] # KIR "TUV" "TON"


# join (left join: every vcov row is kept)
vcov <- join(vcov, gdp, by = c("stateabb", "ccode", "year"), type = "left")
# print the state-years left without GDP / population data
vcov[is.na(vcov$log_pop), ]
vcov[is.na(vcov$log_realgdp), ]
vcov[is.na(vcov$log_rgdppc), ]


########################################################################
############### Material capabilities: COW NMC data  ###################
########################################################################

# Read in original data
nmc <- read.csv("NMC_v4_0.csv", stringsAsFactors = FALSE)
names(nmc)

# define period and variables of interest

table(nmc$year)
nmc <- nmc[is.element(nmc$year, myyears), ]

myvars <- c("stateabb", "ccode", "year", "milex", "milper", "cinc")
nmc <- nmc[names(nmc) %in% myvars]

# Check for missing values (-9 in the original data) and recode them to NA

summary(nmc$milex)
nmc$milex[which(nmc$milex == -9)] <- NA

summary(nmc$milper)
nmc$milper[which(nmc$milper == -9) ] <- NA

summary(nmc$cinc) # ok none

# log transform variables
# milex: add 1 because min=0 and log(0) = -Inf
nmc$log_milex <- log(nmc$milex + 1)
# milper: ditto
nmc$log_milper <- log(nmc$milper + 1)
# cinc: transform scale into positive numbers, with higher log scores indicating higher cinc scores
# (computed as -1 / log(cinc))
nmc$log_cinc <- -1 / log(nmc$cinc)

# check data by year
tapply(nmc$log_milex, nmc$year, summary)
tapply(nmc$log_milper, nmc$year, summary)
tapply(nmc$log_cinc, nmc$year, summary)


#### 1) Join into vcov data frame

# check nmc coverage, using 1970 as the reference year
nmc1970 <- unique(nmc$stateabb[nmc$year == 1970])
# states in nmc not in stateIDs: "ZIM" "FIJ"
nmc1970[!(nmc1970 %in% stateIDs1970)]
# states in stateIDs not in nmc: "BHU" "WSM"
stateIDs1970[!(stateIDs1970 %in% nmc1970)]

# join (left join: every vcov row is kept)
vcov <- join(vcov, nmc, by = c("stateabb", "ccode", "year"), type = "left")
# summarise the joined capability measures by year
tapply(vcov$log_milex, vcov$year, summary)
tapply(vcov$log_milper, vcov$year, summary)
tapply(vcov$log_cinc, vcov$year, summary)


########################################################################
############## Nuclear weapons: Singh and Way (2004) data ##############
########################################################################

# Read in original data
nw <- read.csv("nuclear_weapons.csv", stringsAsFactors = FALSE)
names(nw)

# extract years of interest
table(nw$year)
nw <- nw[nw$year %in% myyears, ]

# extract variables of interest
# (columns keep the order they have in the source file, not the order of myvars)
myvars <- c("cowcc", "year", "level") # level records whether country has NWs and an ongoing program
nw <- nw[names(nw) %in% myvars]


### 1) Insert COW-character country code

table(nw$cowcc)
table(is.na(nw$cowcc)) # no NAs

# rename cowcc variable; done by name rather than by position so the right
# column is renamed whatever the column order of the source file is
names(nw)[names(nw) == "cowcc"] <- "ccode"
nw$stateabb <- countrycode(nw$ccode, "cown", "cowc")
# 441 and 472 not cow ccodes; remove them (rows without an abbreviation)
nw <- nw[!is.na(nw$stateabb), ]


### 2) Deal with name changes
# (bare subsetting/table expressions below only print output for inspection)

# check names
table(nw$stateabb)

# check Czechoslovakia etc
nw[nw$ccode == 315, ] # ok only until 1990
nw[nw$ccode == 316, ] # ok only starts in 1995
nw[nw$ccode == 317, ] # ok only starts in 1995

# check Germany
nw[nw$ccode == 260, ]
# codes Germany after unification as a continuation of GFR (260)
# change ccode to 255 from 1990 on, then relabel every 255 row from 1990 on
# as "GMY" (the second step relies on the ccode set by the first)
from_1990 <- nw$year >= 1990
nw$ccode[from_1990 & nw$stateabb == "GFR"] <- 255
nw$stateabb[from_1990 & nw$ccode == 255] <- "GMY"
nw[nw$ccode == 260, ]
nw[nw$ccode == 255, ]

# check Russia
nw[nw$stateabb == "RUS", ] # ok

# check Yemen
nw[nw$ccode == 678, ]
# codes Yemen after unification (in 1990) as a continuation of YAR
# change ccode to 679 (YEM) from 1990 on, then relabel every 679 row from
# 1990 on as "YEM" (again the second step relies on the first)
nw$ccode[from_1990 & nw$stateabb == "YAR"] <- 679
nw$stateabb[from_1990 & nw$ccode == 679] <- "YEM"
nw[nw$stateabb == "YAR", ]
nw[nw$stateabb == "YEM", ]

# check Yugoslavia
nw[nw$stateabb == "YUG", ] # ok
nw[nw$stateabb == "SER", ] # ok none
nw[nw$stateabb == "BOS", ] # ok only starts in 1995
nw[nw$stateabb == "CRO", ] # ok only starts in 1995
nw[nw$stateabb == "SLV", ] # ok only starts in 1995
nw[nw$stateabb == "MAC", ] # ok only starts in 1995
nw[nw$stateabb == "MNG", ] # ok none


#### 3) Extract variable of interest

table(is.na(nw$level)) # will consider NAs as zeros (no nw)
# "We employ new data on key decisions along the path to nuclear weapons. Conceptually, we argue for viewing
# proliferation as a continuum instead of a dichotomy, defining four stages of proliferation: no noticeable
# interest in nuclear weapons [0], serious exploration of the weapons option [1], launch of a weapons program
# [2], and acquisition of nuclear weapons [3]" (Singh and Way 2004, 861).

# nuclear weapons dummy: 1 when level is 3, 0 otherwise (missing level -> 0)
nw$nw <- as.numeric(nw$level %in% 3)
table(nw$nw)
nw[nw$nw == 1, ]

# drop nw$level; done by name rather than by column position so the right
# column is removed whatever the column order of the source file is
nw$level <- NULL

# check data by year
tapply(nw$nw, nw$year, table)
tapply(nw$nw, nw$year, summary)

# extrapolate 2000 values to 2005: copy the 2000 rows and relabel them 2005
nw2005 <- nw[nw$year == 2000, ]
nw2005$year <- 2005
nw <- rbind(nw, nw2005)

#### 4) Join into vcov data frame

# check nw coverage (states in stateIDs not in nw, and the reverse for 1970)
nw1970 <- unique(nw$stateabb[nw$year == 1970])
stateIDs1970[!(stateIDs1970 %in% nw1970)]
nw1970[!(nw1970 %in% stateIDs1970)]
stateIDs1985[!(stateIDs1985 %in% nw$stateabb[nw$year == 1985])] # "NAM"
stateIDs1990[!(stateIDs1990 %in% nw$stateabb[nw$year == 1990])] # "GFR" "YAR" "MSI" "FSM"
stateIDs2000[!(stateIDs2000 %in% nw$stateabb[nw$year == 2000])] # "KIR" "TUV" "TON" "NAU" "MSI" "FSM"

# join (left join: every vcov row is kept)
vcov <- join(vcov, nw, by = c("stateabb", "ccode", "year"), type = "left")
# tabulate and summarise the joined dummy by year
tapply(vcov$nw, vcov$year, table)
tapply(vcov$nw, vcov$year, summary)


########################################################################
################ Region: UN Statistics Division data ###################
########################################################################

# read in original data
region <- read.csv("region.csv", stringsAsFactors = FALSE)

# insert COW numeric code (translated from the UN country code)
region$ccode <- countrycode(region$country_un, "un", "cown")
table(is.na(region$ccode))
region[is.na(region$ccode), ] # ok, these are not states (except Serbia)

# insert cold war countries and Taiwan
# Each added state copies the region row of an existing UN member (template),
# blanks the UN identifiers and receives its own COW code. The codes are
# numeric so that ccode keeps the same type as the codes produced by
# countrycode() above; character codes would turn the whole join key into
# character when the rows are bound together.
non_un_spec <- data.frame(
  template = c("Czech Republic", "Germany", "Germany", "Viet Nam",
               "China", "Yemen", "Yemen", "Serbia"),
  ccode = c(315, 265, 260, 817, 713, 678, 680, 345),
  stringsAsFactors = FALSE
)

non_un_rows <- lapply(seq_len(nrow(non_un_spec)), function(i) {
  added <- region[region$country_name_un == non_un_spec$template[i], ]
  added$country_un <- NA
  added$country_name_un <- NA
  added$ccode <- non_un_spec$ccode[i]
  added
})

# join non-un states
non_un <- do.call(rbind, non_un_rows)
region <- rbind(region, non_un)

# join into vcov data frame
# (columns 3 to 7 of region are joined; the selection must include ccode)
vcov <- join(vcov, region[, 3:7], by = "ccode", type = "left")
vcov[is.na(vcov$sub_region_n), ]


########################################################################
############## IGO headquarters: The World Treaty Index ################
########################################################################
# To obtain data, go to http://www.worldtreatyindex.com/
# Search by title keyword "headquarters" (120 obs returned)
########################################################################

# Read in original data
hq <- read.csv("igo_headquarters.csv", stringsAsFactors = FALSE)
names(hq)

# Remove treaties between 2 countries
# (keep only the rows where at least one party has no COW code)
hq <- hq[is.na(hq$Party.1.COW) | is.na(hq$Party.2.COW), ]

# Identify IGO headquarters for every year
# For each year of interest, a state is flagged (igo_hq = 1) when it is party
# to a headquarters treaty with Sign.Date at or before that year, so the set
# is cumulative. The sets are built for all of myyears: vcov contains rows
# for every one of these years, and a year without a set would have all of
# its states set to 0 by the NA fill below.
# Counts recorded for the original data: 1970: 17 states, 1975: 21, 1980: 25,
# 1985: 28, 1990: 30, 1995: 35, 2000: 41, 2005: 41.
# NOTE(review): the comparison assumes Sign.Date holds a numeric year - confirm
hq_by_year <- lapply(myyears, function(yr) {
  signed <- hq[hq$Sign.Date <= yr, ]
  hosts <- sort(unique(c(signed$Party.1.COW, signed$Party.2.COW)))
  data.frame(
    ccode = hosts,
    igo_hq = rep(1, length(hosts)),
    year = rep(yr, length(hosts))
  )
})

# join all years
hq <- do.call(rbind, hq_by_year)

# join with vcov; state-years without a match are non-hosts (0)
vcov <- join(vcov, hq, by = c("ccode", "year"), type = "left")
vcov$igo_hq[is.na(vcov$igo_hq)] <- 0
table(is.na(vcov$igo_hq)) # no NAs


# save
write.csv(vcov, file = file.path(SAVEDIR, "vcov.csv"), row.names = FALSE)
